# 找出重复评论, 存为tsv
# 先将评论提取成文本再找重复

import pandas as pd

inputfile = r"..\data\microwave_1selected.tsv"  # data after column selection
outputfile = r"..\data\microwave_2reviews.tsv"  # extracted review text
outputfile1 = r"..\data\microwave_2duplicate_reviews.tsv"  # duplicated reviews


def count_duplicate_reviews(reviews):
    """Return the review texts that occur more than once, with their counts.

    Args:
        reviews: A pandas Series of review texts (anything accepted by
            ``pd.Series`` also works, e.g. a plain list).

    Returns:
        A DataFrame with columns ``review_body`` (the repeated text) and
        ``num`` (how many times it occurs), ordered as ``value_counts``
        orders them (most frequent first) and indexed 0..k-1. Missing values
        are not counted, because ``value_counts`` drops them by default.
    """
    counts = pd.Series(reviews).value_counts()
    # Filter on the counts Series itself instead of looking the counts up by
    # column name: pandas >= 2.0 names the value_counts() result "count"
    # rather than after the source column, so a lookup by 'review_body'
    # raises KeyError there.
    repeated = counts[counts > 1]
    return pd.DataFrame(
        {
            'review_body': repeated.index.tolist(),
            'num': repeated.tolist(),
        },
        columns=['review_body', 'num'],
    )


# Guarded so that importing this module (e.g. to reuse or test
# count_duplicate_reviews) does not touch the data files.
if __name__ == "__main__":
    data = pd.read_csv(inputfile, sep='\t', encoding='utf-8')

    # Export the review column on its own before looking for duplicates.
    data_review = data['review_body']
    data_review.to_csv(outputfile, sep='\t', index=False, header=True, encoding='utf-8')

    m_review_num = count_duplicate_reviews(data_review)
    m_review_num.to_csv(outputfile1, sep='\t', index=False, header=True, encoding='utf-8')
    print(m_review_num)









